<!DOCTYPE html>
<html lang="zh-Hans">
<head>
    <!-- Encoding is declared first so the non-ASCII title and metadata below are decoded as UTF-8. -->
    <meta charset="utf-8">
    <meta name="generator" content="Hexo 3.9.0">
    <title>Java爬虫之Jsoup | 欢迎参观小灰灰的网站哟 ヾ(◍°∇°◍)ﾉﾞ ~ | It&#39;s founded on March 9, 2019 and the open source address for the blog notes https://github.com/YUbuntu0109/YUbuntu0109.github.io</title>
    <!-- maximum-scale is deliberately not set: capping it prevents users from zooming the page. -->
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="theme-color" content="#3F51B5">

    <!-- Search-engine metadata (the keywords tag is emitted once only). -->
    <meta name="keywords" content="Java,Jsoup">
    <meta name="description" content="学习笔记 : Java爬虫之Jsoup(Java HTML Parser)jsoup is a Java library for working with real-world HTML. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and j">

    <!-- Open Graph / Twitter card metadata used for link previews. -->
    <meta property="og:type" content="article">
    <meta property="og:title" content="Java爬虫之Jsoup">
    <!-- NOTE(review): the generated value used the placeholder host yoursite.com; the host below is the blog address quoted in the post's own code samples. Set `url` in the Hexo _config.yml so regenerated pages keep it. -->
    <meta property="og:url" content="https://yubuntu0109.github.io/2019/07/10/Java爬虫之Jsoup/index.html">
    <meta property="og:site_name" content="欢迎参观小灰灰的网站哟 ヾ(◍°∇°◍)ﾉﾞ ~">
    <meta property="og:description" content="学习笔记 : Java爬虫之Jsoup(Java HTML Parser)jsoup is a Java library for working with real-world HTML. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and j">
    <meta property="og:locale" content="en">
    <meta property="og:updated_time" content="2019-10-31T05:19:50.621Z">
    <meta name="twitter:card" content="summary">
    <meta name="twitter:title" content="Java爬虫之Jsoup">
    <meta name="twitter:description" content="学习笔记 : Java爬虫之Jsoup(Java HTML Parser)jsoup is a Java library for working with real-world HTML. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and j">

    <link rel="alternate" type="application/atom+xml" title="欢迎参观小灰灰的网站哟 ヾ(◍°∇°◍)ﾉﾞ ~" href="/atom.xml">
    <link rel="shortcut icon" href="/favicon.ico">
    <!-- Explicit https instead of a protocol-relative URL, so the theme stylesheet is never fetched over plain http. -->
    <link rel="stylesheet" href="https://unpkg.com/hexo-theme-material-indigo@latest/css/style.css">
    <!-- Global array created empty here; presumably filled and consumed by theme scripts later in the page. -->
    <script>window.lazyScripts=[]</script>

    <!-- custom head -->
</head>

<body>
    <div id="loading" class="active"></div>

    <!-- Side drawer: site owner card plus the primary navigation list. -->
    <aside id="menu" class="hide">
  <div class="inner flex-row-vertical">
    <!-- Icon-only control, so the accessible name comes from aria-label. -->
    <a href="javascript:;" class="header-icon waves-effect waves-circle waves-light" id="menu-off" aria-label="Close menu">
        <i class="icon icon-lg icon-close" aria-hidden="true"></i>
    </a>
    <div class="brand-wrap" style="background-image:url(/img/brand.jpg)">
      <div class="brand">
        <a href="/" class="avatar waves-effect waves-circle waves-light">
          <!-- The image is the only content of the home link, so its alt text names the link. -->
          <img src="/img/my-portrait.jpg" alt="黄宇辉">
        </a>
        <hgroup class="introduce">
          <h5 class="nickname">黄宇辉</h5>
          <a href="mailto:3083968068@qq.com" title="3083968068@qq.com" class="mail">3083968068@qq.com</a>
        </hgroup>
      </div>
    </div>
    <div class="scroll-wrap flex-col">
      <ul class="nav">

            <li class="waves-block waves-effect">
              <a href="/">
                <i class="icon icon-lg icon-home" aria-hidden="true"></i>
                homepage
              </a>
            </li>

            <li class="waves-block waves-effect">
              <a href="/archives">
                <i class="icon icon-lg icon-archives" aria-hidden="true"></i>
                Archives
              </a>
            </li>

            <li class="waves-block waves-effect">
              <a href="/tags">
                <i class="icon icon-lg icon-tags" aria-hidden="true"></i>
                Tags
              </a>
            </li>

            <li class="waves-block waves-effect">
              <a href="/categories">
                <i class="icon icon-lg icon-th-list" aria-hidden="true"></i>
                Categories
              </a>
            </li>

            <!-- External links open in a new tab; rel="noopener" keeps the opened page from reaching window.opener. -->
            <li class="waves-block waves-effect">
              <a href="https://github.com/YUbuntu0109" target="_blank" rel="noopener">
                <i class="icon icon-lg icon-github" aria-hidden="true"></i>
                Github
              </a>
            </li>

            <!-- NOTE(review): this Weibo entry points at the same GitHub URL as the entry above; confirm the intended Weibo profile link. -->
            <li class="waves-block waves-effect">
              <a href="https://github.com/YUbuntu0109" target="_blank" rel="noopener">
                <i class="icon icon-lg icon-weibo" aria-hidden="true"></i>
                Weibo
              </a>
            </li>

            <li class="waves-block waves-effect">
              <a href="/custom">
                <i class="icon icon-lg icon-link" aria-hidden="true"></i>
                Test
              </a>
            </li>

      </ul>
    </div>
  </div>
</aside>

    <main id="main">
        <!-- Top bar: menu toggle, page title, search box, share button and the embedded music player. -->
        <header class="top-header" id="header">
    <div class="flex-row">
        <!-- The icon-only controls below carry aria-label so they have an accessible name. -->
        <a href="javascript:;" class="header-icon waves-effect waves-circle waves-light on" id="menu-toggle" aria-label="Toggle menu">
          <i class="icon icon-lg icon-navicon" aria-hidden="true"></i>
        </a>
        <div class="flex-col header-title ellipsis">Java爬虫之Jsoup</div>

        <div class="search-wrap" id="search-wrap">
            <a href="javascript:;" class="header-icon waves-effect waves-circle waves-light" id="back" aria-label="Back">
                <i class="icon icon-lg icon-chevron-left" aria-hidden="true"></i>
            </a>
            <!-- A placeholder is not a label; aria-label names the field for assistive technology. -->
            <input type="text" id="key" class="search-input" autocomplete="off" placeholder="Search" aria-label="Search">
            <a href="javascript:;" class="header-icon waves-effect waves-circle waves-light" id="search" aria-label="Search">
                <i class="icon icon-lg icon-search" aria-hidden="true"></i>
            </a>
        </div>

        <a href="javascript:;" class="header-icon waves-effect waves-circle waves-light" id="menuShare" aria-label="Share">
            <i class="icon icon-lg icon-share-alt" aria-hidden="true"></i>
        </a>

        <!-- background music (Mar 11, 2019 AM) -->
        <div>
            <!-- title names the frame for screen readers; the URL is explicit https and its ampersands are escaped. -->
            <iframe title="Background music player" frameborder="no" border="0" marginwidth="0" marginheight="0" width="280" height="52" src="https://music.163.com/outchain/player?type=2&amp;id=438801642&amp;auto=1&amp;height=32"></iframe>
        </div>
        <!-- end of background music -->
    </div>
</header>
<!-- Post banner: the post title followed by its publication date. -->
<header class="content-header post-header">
    <div class="container fade-scale">
        <h1 class="title">Java爬虫之Jsoup</h1>
        <h5 class="subtitle">
            <time class="page-time" datetime="2019-07-10T18:55:14.000Z" itemprop="datePublished">2019-07-10</time>
        </h5>
    </div>
</header>


<div class="container body-wrap">
    
    <!-- Table-of-contents widget; every entry links to a heading id inside the post body. -->
    <aside class="post-widget">
        <!-- aria-label distinguishes this navigation region from the site menu. -->
        <nav class="post-toc-wrap post-toc-shrink" id="post-toc" aria-label="Table of contents">
            <h4>TOC</h4>
            <ol class="post-toc"><li class="post-toc-item post-toc-level-2"><a class="post-toc-link" href="#学习笔记-Java爬虫之Jsoup-Java-HTML-Parser"><span class="post-toc-number">1.</span> <span class="post-toc-text">学习笔记 : Java爬虫之Jsoup(Java HTML Parser)</span></a><ol class="post-toc-child"><li class="post-toc-item post-toc-level-3"><a class="post-toc-link" href="#爬虫实现"><span class="post-toc-number">1.1.</span> <span class="post-toc-text">爬虫实现</span></a></li><li class="post-toc-item post-toc-level-3"><a class="post-toc-link" href="#Jsoup-DOM"><span class="post-toc-number">1.2.</span> <span class="post-toc-text">Jsoup DOM</span></a></li><li class="post-toc-item post-toc-level-3"><a class="post-toc-link" href="#Jsoup-Selector"><span class="post-toc-number">1.3.</span> <span class="post-toc-text">Jsoup Selector</span></a></li></ol></li></ol>
        </nav>
    </aside>


<article id="post-Java爬虫之Jsoup"
  class="post-article article-type-post fade" itemprop="blogPost">

    <div class="post-card">
        <h1 class="post-card-title">Java爬虫之Jsoup</h1>
        <!-- Post metadata row: the publication date, with the full timestamp in the title attribute. -->
        <div class="post-meta">
            <time class="post-time" datetime="2019-07-10T18:55:14.000Z" title="2019-07-10 18:55:14" itemprop="datePublished">2019-07-10</time>
        </div>
        <div class="post-content" id="post-content" itemprop="postContent">
            <h2 id="学习笔记-Java爬虫之Jsoup-Java-HTML-Parser"><a href="#学习笔记-Java爬虫之Jsoup-Java-HTML-Parser" class="headerlink" title="学习笔记 : Java爬虫之Jsoup(Java HTML Parser)"></a>学习笔记 : Java爬虫之Jsoup(Java HTML Parser)</h2><p><em>jsoup is a Java library for working with real-world HTML. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and jquery-like methods.</em></p>
<p><em>👍 jsoup Cookbook : <a href="https://www.open-open.com/jsoup/" target="_blank" rel="noopener">https://www.open-open.com/jsoup/</a></em></p>
<h3 id="爬虫实现"><a href="#爬虫实现" class="headerlink" title="爬虫实现"></a>爬虫实现</h3><p><em>使用Jsoup解析不同数据的方式,示例程序如下 :</em><br><figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">package</span> 
pers.huangyuhui.crawler.Jsoup_Demo;</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> org.apache.commons.io.FileUtils;</span><br><span class="line"><span class="keyword">import</span> org.jsoup.Jsoup;</span><br><span class="line"><span class="keyword">import</span> org.jsoup.nodes.Document;</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> java.io.File;</span><br><span class="line"><span class="keyword">import</span> java.io.IOException;</span><br><span class="line"></span><br><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@project</span>: crawler_learning</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@description</span>: 学习使用Jsoup</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@author</span>: 黄宇辉</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@date</span>: 7/8/2019-9:34 PM</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@version</span>: 1.0</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@website</span>: https://yubuntu0109.github.io/</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">class</span> <span class="title">JsoupTest</span> </span>&#123;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> <span class="keyword">void</span> <span class="title">main</span><span class="params">(String[] args)</span> <span class="keyword">throws</span> IOException </span>&#123;</span><br><span class="line">        urlTest();</span><br><span class="line">        stringTest();</span><br><span class="line">        
fileTest();</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="comment">// <span class="doctag">TODO:</span> 7/8/2019 解析URL:获取网页title标签内容</span></span><br><span class="line">    <span class="function"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">void</span> <span class="title">urlTest</span><span class="params">()</span> <span class="keyword">throws</span> IOException </span>&#123;</span><br><span class="line"></span><br><span class="line">        <span class="comment">//解析URL地址并设置请求信息,获取Document对象</span></span><br><span class="line">        Document document = Jsoup.connect(<span class="string">"https://www.bilibili.com/"</span>)</span><br><span class="line">                .timeout(<span class="number">1000</span>)</span><br><span class="line">                .userAgent(<span class="string">"x-x-x-x-x-x"</span>)</span><br><span class="line">                .get();</span><br><span class="line"></span><br><span class="line">        System.out.println(document.title());</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="comment">// <span class="doctag">TODO:</span> 7/9/2019 解析字符串:获取字符串中title标签内容</span></span><br><span class="line">    <span class="function"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">void</span> <span class="title">stringTest</span><span class="params">()</span> <span class="keyword">throws</span> IOException </span>&#123;</span><br><span class="line">        <span class="comment">//使用文件工具类读取文件,获取字符串</span></span><br><span class="line">        String content = FileUtils.readFileToString(<span class="keyword">new</span> File(<span class="string">"C:/Users/Administrator/Desktop/html.html"</span>), <span class="string">"gbk"</span>);</span><br><span class="line">        <span class="comment">//解析字符串</span></span><br><span 
class="line">        Document doc = Jsoup.parse(content);</span><br><span class="line">        System.out.println(doc.getElementsByTag(<span class="string">"title"</span>).first().text());</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="comment">// <span class="doctag">TODO:</span> 7/9/2019 解析文件:获取文件中title标签内容</span></span><br><span class="line">    <span class="function"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">void</span> <span class="title">fileTest</span><span class="params">()</span> <span class="keyword">throws</span> IOException </span>&#123;</span><br><span class="line">        <span class="comment">//解析文件</span></span><br><span class="line">        Document doc = Jsoup.parse(<span class="keyword">new</span> File(<span class="string">"C:/Users/Administrator/Desktop/html.html"</span>), <span class="string">"gbk"</span>);</span><br><span class="line">        System.out.println(doc.title());</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure></p>
<h3 id="Jsoup-DOM"><a href="#Jsoup-DOM" class="headerlink" title="Jsoup DOM"></a>Jsoup DOM</h3><p><em>学习使用Jsoup-DOM获取元素及其中数据,示例程序如下 :</em><br><figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span 
class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">package</span> pers.huangyuhui.crawler.Jsoup_DOM;</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> org.jsoup.Jsoup;</span><br><span class="line"><span class="keyword">import</span> org.jsoup.nodes.Document;</span><br><span class="line"><span class="keyword">import</span> org.jsoup.nodes.Element;</span><br><span class="line"><span class="keyword">import</span> org.jsoup.select.Elements;</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> java.io.IOException;</span><br><span class="line"></span><br><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@project</span>: crawler_learning</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@description</span>: 学习使用Jsoup-DOM获取元素及其中数据</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@author</span>: 黄宇辉</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@date</span>: 7/9/2019-8:18 AM</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@version</span>: 
1.0</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@website</span>: https://yubuntu0109.github.io/</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">class</span> <span class="title">ElementTest</span> </span>&#123;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> <span class="keyword">void</span> <span class="title">main</span><span class="params">(String[] args)</span> </span>&#123;</span><br><span class="line">        <span class="keyword">try</span> &#123;</span><br><span class="line">            <span class="comment">/*</span></span><br><span class="line"><span class="comment">                解析url并设置请求信息,获取Document对象</span></span><br><span class="line"><span class="comment">             */</span></span><br><span class="line">            Document doc = Jsoup.connect(<span class="string">"https://yubuntu0109.github.io/"</span>)</span><br><span class="line">                    .timeout(<span class="number">5000</span>)</span><br><span class="line">                    .userAgent(<span class="string">"x-x-x-x-x-x"</span>)</span><br><span class="line">                    .get();</span><br><span class="line"></span><br><span class="line">            <span class="comment">/*</span></span><br><span class="line"><span class="comment">                根据Document对象获取元素</span></span><br><span class="line"><span class="comment">             */</span></span><br><span class="line">            <span class="comment">//1:通过id获取元素:获取博客头标题</span></span><br><span class="line">            Element element = doc.getElementById(<span class="string">"header"</span>);</span><br><span class="line">            System.out.println(element.text());</span><br><span class="line"></span><br><span class="line">            <span 
class="comment">//2:通过tag获取元素:获取博客文章标题</span></span><br><span class="line">            Elements elements = doc.getElementsByTag(<span class="string">"h3"</span>);</span><br><span class="line">            <span class="keyword">for</span> (Element e : elements) &#123;</span><br><span class="line">                System.out.println(e.text());</span><br><span class="line">            &#125;</span><br><span class="line">            <span class="comment">//3:通过class获取元素:获取博客文章简介</span></span><br><span class="line">            Elements elements2 = doc.getElementsByClass(<span class="string">"post-content"</span>);</span><br><span class="line">            <span class="keyword">for</span> (Element e : elements2) &#123;</span><br><span class="line">                System.out.println(e.text() + <span class="string">"\n"</span>);</span><br><span class="line">            &#125;</span><br><span class="line">            <span class="comment">//4:通过attribute获取元素:获取博客文章发布时间</span></span><br><span class="line">            Elements elements3 = doc.getElementsByAttribute(<span class="string">"datetime"</span>);</span><br><span class="line">            <span class="keyword">for</span> (Element e : elements3) &#123;</span><br><span class="line">                System.out.println(e.text());</span><br><span class="line">            &#125;</span><br><span class="line">            <span class="comment">//4-2:通过属性名加属性值筛选元素:获取博客文章标题</span></span><br><span class="line">            Elements elements4 = doc.getElementsByAttributeValue(<span class="string">"itemprop"</span>, <span class="string">"name"</span>);</span><br><span class="line">            <span class="keyword">for</span> (Element e : elements4) &#123;</span><br><span class="line">                System.out.println(e.text());</span><br><span class="line">            &#125;</span><br><span class="line"></span><br><span class="line"></span><br><span class="line">            <span class="comment">/*</span></span><br><span 
class="line"><span class="comment">                获取元素中的数据</span></span><br><span class="line"><span class="comment">             */</span></span><br><span class="line">            Element e = doc.getElementById(<span class="string">"header"</span>);</span><br><span class="line">            <span class="comment">//1:从元素中获取id</span></span><br><span class="line">            System.out.println(e.id());</span><br><span class="line">            <span class="comment">//2:从元素中获取文本内容text</span></span><br><span class="line">            System.out.println(e.text());</span><br><span class="line">            <span class="comment">//3:从元素中获取className</span></span><br><span class="line">            System.out.println(e.className());</span><br><span class="line">            <span class="comment">//4:从元素中获取属性的值attr</span></span><br><span class="line">            System.out.println(e.attr(<span class="string">"id"</span>));</span><br><span class="line">            <span class="comment">//5:从元素中获取所有属性attributes</span></span><br><span class="line">            System.out.println(e.attributes().toString());</span><br><span class="line">            <span class="comment">// ······</span></span><br><span class="line"></span><br><span class="line">        &#125; <span class="keyword">catch</span> (IOException e) &#123;</span><br><span class="line">            e.printStackTrace();</span><br><span class="line">        &#125;</span><br><span class="line">    &#125;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure></p>
<h3 id="Jsoup-Selector"><a href="#Jsoup-Selector" class="headerlink" title="Jsoup Selector"></a>Jsoup Selector</h3><p><em>学习使用Jsoup选择器,示例程序如下 :</em><br><figure class="highlight java"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span 
class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">package</span> pers.huangyuhui.crawler.Jsoup_Selector;</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> org.jsoup.Jsoup;</span><br><span class="line"><span class="keyword">import</span> org.jsoup.nodes.Document;</span><br><span class="line"><span class="keyword">import</span> org.jsoup.nodes.Element;</span><br><span class="line"><span class="keyword">import</span> org.jsoup.select.Elements;</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> java.io.IOException;</span><br><span class="line"></span><br><span class="line"><span class="comment">/**</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@project</span>: crawler_learning</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@description</span>: 学习使用Jsoup选择器</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@author</span>: 黄宇辉</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@date</span>: 7/9/2019-10:29 AM</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@version</span>: 1.0</span></span><br><span class="line"><span class="comment"> * <span class="doctag">@website</span>: https://yubuntu0109.github.io/</span></span><br><span class="line"><span class="comment"> */</span></span><br><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">class</span> <span class="title">SelectorTest</span> 
</span>&#123;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> <span class="keyword">void</span> <span class="title">main</span><span class="params">(String[] args)</span> <span class="keyword">throws</span> IOException </span>&#123;</span><br><span class="line">        <span class="comment">/*</span></span><br><span class="line"><span class="comment">            解析url并设置请求信息,获取Document对象</span></span><br><span class="line"><span class="comment">         */</span></span><br><span class="line">        Document doc = Jsoup.connect(<span class="string">"https://yubuntu0109.github.io/"</span>)</span><br><span class="line">                .timeout(<span class="number">5000</span>)</span><br><span class="line">                .userAgent(<span class="string">"x-x-x-x-x-x"</span>)</span><br><span class="line">                .get();</span><br><span class="line"></span><br><span class="line">        <span class="comment">/*</span></span><br><span class="line"><span class="comment">            Jsoup选择器的使用方式</span></span><br><span class="line"><span class="comment">         */</span></span><br><span class="line">        <span class="comment">//el#id : 获取博客头标题</span></span><br><span class="line">        System.out.println(doc.select(<span class="string">"header#header"</span>).first().text());</span><br><span class="line"></span><br><span class="line">        <span class="comment">//el.class : 获取博客的导航标签</span></span><br><span class="line">        Elements elements = doc.select(<span class="string">"ul.nav"</span>);</span><br><span class="line">        <span class="keyword">for</span> (Element e : elements) &#123;</span><br><span class="line">            System.out.println(e.text());</span><br><span class="line">        &#125;</span><br><span class="line"></span><br><span class="line">        <span class="comment">//el[attr] : 获取博客文章标题</span></span><br><span 
class="line">        Elements elements2 = doc.select(<span class="string">"h3[itemprop]"</span>);</span><br><span class="line">        <span class="keyword">for</span> (Element e : elements2) &#123;</span><br><span class="line">            System.out.println(e.text());</span><br><span class="line">        &#125;</span><br><span class="line"></span><br><span class="line">        <span class="comment">//.ancestor child : 查询某个元素的下个子元素,获取博客导航标签</span></span><br><span class="line">        Elements elements3 = doc.select(<span class="string">".nav li"</span>);</span><br><span class="line">        <span class="keyword">for</span> (Element e : elements3) &#123;</span><br><span class="line">            System.out.println(e.text());</span><br><span class="line">        &#125;</span><br><span class="line"></span><br><span class="line">        <span class="comment">//parent &gt; child : 查询某个父元素下的直接子元素,获取博客文章链接的href值</span></span><br><span class="line">        Elements elements4 = doc.select(<span class="string">".post-title &gt; a"</span>);</span><br><span class="line">        <span class="keyword">for</span> (Element e : elements4) &#123;</span><br><span class="line">            System.out.println(e.attr(<span class="string">"href"</span>));</span><br><span class="line">        &#125;</span><br><span class="line"></span><br><span class="line">        <span class="comment">//parenet &gt; * : 查询某个父元素下所有直接子元素,获取博客文章标题</span></span><br><span class="line">        Elements elements5 = doc.select(<span class="string">".post-title &gt; *"</span>);</span><br><span class="line">        <span class="keyword">for</span> (Element e : elements5) &#123;</span><br><span class="line">            System.out.println(e.text());</span><br><span class="line">        &#125;</span><br><span class="line"></span><br><span class="line">    &#125;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure></p>

        </div>

        <!-- Post copyright box: shows the "Last updated" timestamp and a footer link with a portrait image and name. -->
        <blockquote class="post-copyright">
    
    <div class="content">
        
<span class="post-time">
    Last updated: <time datetime="2019-10-31T05:19:50.621Z" itemprop="dateUpdated">2019-10-31 05:19:50</time>
</span><br>


        
    </div>
    
    <footer>
        <!-- NOTE(review): this href is http://yoursite.com, which looks like an unconfigured placeholder; confirm the intended site URL. -->
        <a href="http://yoursite.com">
            <img src="/img/my-portrait.jpg" alt="黄宇辉">
            黄宇辉
        </a>
    </footer>
</blockquote>

        
<!-- Reward button; its id suggests a script uses it to open the reward modal (TODO confirm: the handler is not visible here). -->
<div class="page-reward">
    <a id="rewardBtn" href="javascript:;" class="page-reward-btn waves-effect waves-circle waves-light">赏</a>
</div>



        <div class="post-footer">
            
	<ul class="article-tag-list"><li class="article-tag-list-item"><a class="article-tag-list-link" href="/tags/Java/">Java</a></li><li class="article-tag-list-item"><a class="article-tag-list-link" href="/tags/Jsoup/">Jsoup</a></li></ul>


            
<!--
  In-post share widget: a list of share targets plus the floating share button.
  - Links opening a new tab carry rel="noopener" so the target page cannot reach window.opener.
  - Ampersands inside href values are written as &amp;; the parsed URLs are unchanged.
  - Icon-only links get an aria-label so they have an accessible name.
-->
<div class="page-share-wrap">

<div class="page-share" id="pageShare">
    <ul class="reset share-icons">
      <li>
        <a class="weibo share-sns" target="_blank" rel="noopener" href="http://service.weibo.com/share/share.php?url=http://yoursite.com/2019/07/10/Java爬虫之Jsoup/&amp;title=《Java爬虫之Jsoup》 — 欢迎参观小灰灰的网站哟 ヾ(◍°∇°◍)ﾉﾞ ~&amp;pic=http://yoursite.com/img/my-portrait.jpg" data-title="微博" aria-label="Share on Weibo">
          <i class="icon icon-weibo" aria-hidden="true"></i>
        </a>
      </li>
      <li>
        <a class="weixin share-sns wxFab" href="javascript:;" data-title="微信" aria-label="Share on WeChat">
          <i class="icon icon-weixin" aria-hidden="true"></i>
        </a>
      </li>
      <li>
        <a class="qq share-sns" target="_blank" rel="noopener" href="http://connect.qq.com/widget/shareqq/index.html?url=http://yoursite.com/2019/07/10/Java爬虫之Jsoup/&amp;title=《Java爬虫之Jsoup》 — 欢迎参观小灰灰的网站哟 ヾ(◍°∇°◍)ﾉﾞ ~&amp;source=My Personal Website For Blog" data-title=" QQ" aria-label="Share on QQ">
          <i class="icon icon-qq" aria-hidden="true"></i>
        </a>
      </li>
      <li>
        <a class="facebook share-sns" target="_blank" rel="noopener" href="https://www.facebook.com/sharer/sharer.php?u=http://yoursite.com/2019/07/10/Java爬虫之Jsoup/" data-title=" Facebook" aria-label="Share on Facebook">
          <i class="icon icon-facebook" aria-hidden="true"></i>
        </a>
      </li>
      <li>
        <a class="twitter share-sns" target="_blank" rel="noopener" href="https://twitter.com/intent/tweet?text=《Java爬虫之Jsoup》 — 欢迎参观小灰灰的网站哟 ヾ(◍°∇°◍)ﾉﾞ ~&amp;url=http://yoursite.com/2019/07/10/Java爬虫之Jsoup/&amp;via=http://yoursite.com" data-title=" Twitter" aria-label="Share on Twitter">
          <i class="icon icon-twitter" aria-hidden="true"></i>
        </a>
      </li>
      <!-- NOTE(review): Google+ was shut down for consumers in April 2019, so this share target is probably dead; confirm before keeping it. -->
      <li>
        <a class="google share-sns" target="_blank" rel="noopener" href="https://plus.google.com/share?url=http://yoursite.com/2019/07/10/Java爬虫之Jsoup/" data-title=" Google+" aria-label="Share on Google+">
          <i class="icon icon-google-plus" aria-hidden="true"></i>
        </a>
      </li>
    </ul>
 </div>

    <!-- Floating share button (icon only). -->
    <a href="javascript:;" id="shareFab" class="page-share-fab waves-effect waves-circle" aria-label="Share">
        <i class="icon icon-share-alt icon-lg" aria-hidden="true"></i>
    </a>
</div>



        </div>
    </div>

    
<!-- Previous/next post navigation: each entry is one link wrapping a direction hint and the neighbouring post's title. -->
<nav class="post-nav flex-row flex-justify-between">
  
    <div class="waves-block waves-effect prev">
      <a href="/2019/07/12/Java爬虫之WebMagic/" id="post-prev" class="post-nav-link">
        <div class="tips"><i class="icon icon-angle-left icon-lg icon-pr"></i> Prev</div>
        <h4 class="title">Java爬虫之WebMagic</h4>
      </a>
    </div>
  

  
    <div class="waves-block waves-effect next">
      <a href="/2019/07/10/Java爬虫之HttpClient/" id="post-next" class="post-nav-link">
        <div class="tips">Next <i class="icon icon-angle-right icon-lg icon-pl"></i></div>
        <h4 class="title">Java爬虫之HttpClient</h4>
      </a>
    </div>
  
</nav>



    




















</article>

<!-- Reward modal: close control, a short title and the reward QR-code image. -->
<div id="reward" class="page-modal reward-lay">
    <!-- Icon-only close control: aria-label supplies the accessible name the icon font cannot. -->
    <a class="close" href="javascript:;" aria-label="Close"><i class="icon icon-close" aria-hidden="true"></i></a>
    <h3 class="reward-title">
        <i class="icon icon-quote-left" aria-hidden="true"></i>
        thanks ~
        <i class="icon icon-quote-right" aria-hidden="true"></i>
    </h3>
    <div class="reward-content">

        <div class="reward-code">
            <img id="rewardCode" src="/img/Wechat_appreciates.png" alt="打赏二维码">
        </div>

    </div>
</div>



</div>

        <footer class="footer">
    <div class="top">
        <p>
            <span><a href="/atom.xml" target="_blank" rel="noopener" class="rss" title="rss" aria-label="RSS feed"><i class="icon icon-lg icon-rss" aria-hidden="true"></i></a></span>
            <span>This blog is licensed under a <a rel="license" href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>.</span>
        </p>
    </div>
    <div class="bottom">
        <!-- Site visit counters, powered by busuanzi (http://busuanzi.ibruce.info/); added Mar 13, 2019. -->
        <!--
          The counters used to sit in a <div> nested inside <p><font>. A <div> start tag
          closes an open <p>, so the browser produced stray empty paragraphs. They now
          live in a plain centered <div>, with a <span> carrying the former <font> styling.
        -->
        <div style="text-align: center">
            <!-- Counter script; presumably fills the busuanzi_value_* spans below (TODO confirm). -->
            <script async src="//busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>
            <!-- Counter placeholders -->
            <span style="font-size: 12px; color: springgreen">
                <span id="busuanzi_container_site_pv">
                    ◎用户总访问量 : <span id="busuanzi_value_site_pv"></span> 次 ~ &nbsp;&nbsp;
                </span>
                <span id="busuanzi_container_site_uv">
                    ◎总访客数(（づ￣3￣）づ╭❤～) : <span id="busuanzi_value_site_uv"></span>人 ~
                </span>
            </span>
        </div>
        <!-- Copyright line and credits; the <span> wrapper replaces the deprecated <font> element. -->
        <p>
            <span style="font-size: 10px">
                <span>黄宇辉 &copy; 2019</span>
                <span>

                    Blog source <a href="https://github.com/YUbuntu0109/YUbuntu0109.github.io" target="_blank" rel="noopener">Github</a>
                    Power by <a href="http://hexo.io/" target="_blank" rel="noopener">Hexo</a>
                    Theme <a href="https://github.com/yscoder/hexo-theme-indigo" target="_blank" rel="noopener">indigo</a>
                </span>
            </span>
        </p>
    </div>
    </footer>
    </main>
    <div class="mask" id="mask"></div>
<!-- Icon-only back-to-top control; aria-label supplies its accessible name. -->
<a href="javascript:;" id="gotop" class="waves-effect waves-circle waves-light" aria-label="Back to top"><span class="icon icon-lg icon-chevron-up" aria-hidden="true"></span></a>



<!--
  Global share panel: the same share targets as the in-post share list.
  - Links opening a new tab carry rel="noopener" so the target page cannot reach window.opener.
  - Ampersands inside href values are written as &amp;; the parsed URLs are unchanged.
  - Icon-only links get an aria-label so they have an accessible name.
-->
<div class="global-share" id="globalShare">
    <ul class="reset share-icons">
      <li>
        <a class="weibo share-sns" target="_blank" rel="noopener" href="http://service.weibo.com/share/share.php?url=http://yoursite.com/2019/07/10/Java爬虫之Jsoup/&amp;title=《Java爬虫之Jsoup》 — 欢迎参观小灰灰的网站哟 ヾ(◍°∇°◍)ﾉﾞ ~&amp;pic=http://yoursite.com/img/my-portrait.jpg" data-title="微博" aria-label="Share on Weibo">
          <i class="icon icon-weibo" aria-hidden="true"></i>
        </a>
      </li>
      <li>
        <a class="weixin share-sns wxFab" href="javascript:;" data-title="微信" aria-label="Share on WeChat">
          <i class="icon icon-weixin" aria-hidden="true"></i>
        </a>
      </li>
      <li>
        <a class="qq share-sns" target="_blank" rel="noopener" href="http://connect.qq.com/widget/shareqq/index.html?url=http://yoursite.com/2019/07/10/Java爬虫之Jsoup/&amp;title=《Java爬虫之Jsoup》 — 欢迎参观小灰灰的网站哟 ヾ(◍°∇°◍)ﾉﾞ ~&amp;source=My Personal Website For Blog" data-title=" QQ" aria-label="Share on QQ">
          <i class="icon icon-qq" aria-hidden="true"></i>
        </a>
      </li>
      <li>
        <a class="facebook share-sns" target="_blank" rel="noopener" href="https://www.facebook.com/sharer/sharer.php?u=http://yoursite.com/2019/07/10/Java爬虫之Jsoup/" data-title=" Facebook" aria-label="Share on Facebook">
          <i class="icon icon-facebook" aria-hidden="true"></i>
        </a>
      </li>
      <li>
        <a class="twitter share-sns" target="_blank" rel="noopener" href="https://twitter.com/intent/tweet?text=《Java爬虫之Jsoup》 — 欢迎参观小灰灰的网站哟 ヾ(◍°∇°◍)ﾉﾞ ~&amp;url=http://yoursite.com/2019/07/10/Java爬虫之Jsoup/&amp;via=http://yoursite.com" data-title=" Twitter" aria-label="Share on Twitter">
          <i class="icon icon-twitter" aria-hidden="true"></i>
        </a>
      </li>
      <!-- NOTE(review): Google+ was shut down for consumers in April 2019, so this share target is probably dead; confirm before keeping it. -->
      <li>
        <a class="google share-sns" target="_blank" rel="noopener" href="https://plus.google.com/share?url=http://yoursite.com/2019/07/10/Java爬虫之Jsoup/" data-title=" Google+" aria-label="Share on Google+">
          <i class="icon icon-google-plus" aria-hidden="true"></i>
        </a>
      </li>
    </ul>
 </div>


<!-- WeChat share modal: shows a QR code (generated by api.qrserver.com) that encodes this post's URL. -->
<div class="page-modal wx-share" id="wxShare">
    <!-- Icon-only close control: aria-label supplies the accessible name the icon font cannot. -->
    <a class="close" href="javascript:;" aria-label="Close"><i class="icon icon-close" aria-hidden="true"></i></a>
    <p>扫一扫，分享到微信</p>
    <img src="//api.qrserver.com/v1/create-qr-code/?data=http://yoursite.com/2019/07/10/Java爬虫之Jsoup/" alt="微信分享二维码">
</div>




    <!-- node-waves 0.7.4, loaded from a CDN through a protocol-relative URL. -->
    <script src="//cdn.bootcss.com/node-waves/0.7.4/waves.min.js"></script>
<script>
// Global page settings; presumably read by the theme script loaded below (TODO confirm).
var BLOG = { ROOT: '/', SHARE: true, REWARD: true };


</script>

<!-- NOTE(review): "@latest" is not pinned to a version, so the script served here can change over time; consider pinning. -->
<script src="//unpkg.com/hexo-theme-material-indigo@latest/js/main.min.js"></script>


<!-- Search results panel; the result list is empty in the static markup. -->
<div class="search-panel" id="search-panel">
    <ul class="search-result" id="search-result"></ul>
</div>
<!-- Markup template for a single search result. {path}, {title}, {tags} and {date} look like placeholders substituted by the search script (TODO confirm). -->
<template id="search-tpl">
<li class="item">
    <a href="{path}" class="waves-block waves-effect">
        <div class="title ellipsis" title="{title}">{title}</div>
        <div class="flex-row flex-middle">
            <div class="tags ellipsis">
                {tags}
            </div>
            <time class="flex-col time">{date}</time>
        </div>
    </a>
</li>
</template>

<!-- NOTE(review): "@latest" is not pinned to a version, so the script served here can change over time; consider pinning. -->
<script src="//unpkg.com/hexo-theme-material-indigo@latest/js/search.min.js" async></script>








<script>
// Swaps the tab title while the page is hidden, shows a greeting when the
// visitor returns, then restores the original title two seconds later.
(function () {
    var savedTitle = document.title;
    var restoreTimer;

    // Puts back the title captured when this script first ran.
    function restoreTitle() {
        document.title = savedTitle;
    }

    function onVisibilityChange() {
        if (!document.hidden) {
            // Page is visible again: greet, then schedule the restore.
            document.title = 'As long as you love me ~';
            restoreTimer = setTimeout(restoreTitle, 2000);
            return;
        }
        // Page went to the background: cancel any pending restore so it
        // cannot overwrite the "hidden" title.
        document.title = 'Where are you going ?';
        clearTimeout(restoreTimer);
    }

    document.addEventListener('visibilitychange', onVisibilityChange);
})();
</script>



</body>
</html>
